################################################################################
###Metabolomics - load and normalize data
################################################################################


################################################################################################################################################################
###define some functions                                                        ################################################################################

###FattyTargets: load the fatty acid measurements listed in a targets table
###  targets   data frame with exactly one column called "filename" (the xlsx files to load)
###  location  directory holding the xlsx files; when omitted the file names are used as given
###            (the previous code pasted "/" in front of them and therefore looked in the file system root)
###  sheet     worksheet that is read from every file (default 4)
###  batch     rows of targets to use (default: all rows)
###Returns a long-format data frame with the columns trait, strain, biological, technical,
###Conc (numeric) and batch (position of the file among the unique, non-NA file names).
###Every sheet needs a column called trait; the other column names are split as
###<strain>_<biological>t<technical>. Without any usable file name an empty data frame
###with the same columns is returned.
FattyTargets <- function(targets, location, sheet, batch){
    if(missing(targets)){   stop("no targets specified")}
    if(missing(location)){  location <- ""}
    if(missing(sheet)){     sheet <- 4}
    ###seq_len gives an empty selection for an empty table, 1:nrow would give c(1,0)
    if(missing(batch)){     batch <- seq_len(nrow(targets))}
    if(sum(colnames(targets) == "filename") != 1){ stop("filename required")}

    ###drop=FALSE keeps a one-column targets table a data frame
    targets <- targets[batch, , drop=FALSE]
    target.vector <- unique(as.character(targets$filename))
    target.vector <- target.vector[!is.na(target.vector)]

    ###nothing to load: hand back the empty shape instead of trying to read a non-existing file
    if(length(target.vector) == 0){
        return(data.frame(trait=character(0), strain=character(0), biological=character(0),
                          technical=character(0), Conc=numeric(0), batch=integer(0),
                          stringsAsFactors=FALSE))
    }

    per.file <- lapply(seq_along(target.vector), function(i){
        ###Load data
        path <- if(nzchar(location)){ file.path(location, target.vector[i]) }else{ target.vector[i] }
        tmp <- read.xlsx(path, sheet = sheet)
        ###remove any columns/rows consisting of only NA (drop=FALSE: stay a data frame when one column is left)
        keep.rows <- rowSums(is.na(tmp)) != ncol(tmp)
        keep.cols <- colSums(is.na(tmp)) != nrow(tmp)
        tmp <- tmp[keep.rows, keep.cols, drop=FALSE]
        ###long format; the sample names are the column headers
        tmp <- gather(tmp, key=Sample, value=Conc, -trait) %>%
               separate(Sample, into=c("strain","replicate"), sep="_") %>%
               separate(replicate, into=c("biological","technical"), sep="t")
        mutate(tmp, batch=i)
    })

    ###bind once at the end instead of growing the result inside the loop
    output <- do.call(rbind, per.file)
    output$Conc <- as.numeric(as.character(unlist(output$Conc)))
    return(output)
}


###AminoTargets: load the amino acid measurements listed in a targets table
###  targets   data frame with exactly one column called "filename" (the xlsx files to load)
###  location  directory holding the xlsx files; when omitted the file names are used as given
###            (the previous code pasted "/" in front of them and therefore looked in the file system root)
###  sheet     worksheet that is read from every file (default 3)
###  batch     rows of targets to use (default: all rows)
###Returns a long-format data frame with the columns trait, strain, biological, technical,
###Conc (numeric) and batch (position of the file among the unique, non-NA file names).
###Every sheet needs a column called trait; the other column names are split as
###<strain>_<biological>t<technical>. Without any usable file name an empty data frame
###with the same columns is returned.
AminoTargets <- function(targets, location, sheet, batch){
    if(missing(targets)){   stop("no targets specified")}
    if(missing(location)){  location <- ""}
    if(missing(sheet)){     sheet <- 3}
    ###seq_len gives an empty selection for an empty table, 1:nrow would give c(1,0)
    if(missing(batch)){     batch <- seq_len(nrow(targets))}
    if(sum(colnames(targets) == "filename") != 1){ stop("filename required")}

    ###drop=FALSE keeps a one-column targets table a data frame
    targets <- targets[batch, , drop=FALSE]
    target.vector <- unique(as.character(targets$filename))
    target.vector <- target.vector[!is.na(target.vector)]

    ###nothing to load: hand back the empty shape instead of trying to read a non-existing file
    if(length(target.vector) == 0){
        return(data.frame(trait=character(0), strain=character(0), biological=character(0),
                          technical=character(0), Conc=numeric(0), batch=integer(0),
                          stringsAsFactors=FALSE))
    }

    per.file <- lapply(seq_along(target.vector), function(i){
        ###Load data
        path <- if(nzchar(location)){ file.path(location, target.vector[i]) }else{ target.vector[i] }
        tmp <- read.xlsx(path, sheet = sheet)
        ###remove any columns/rows consisting of only NA (drop=FALSE: stay a data frame when one column is left)
        keep.rows <- rowSums(is.na(tmp)) != ncol(tmp)
        keep.cols <- colSums(is.na(tmp)) != nrow(tmp)
        tmp <- tmp[keep.rows, keep.cols, drop=FALSE]
        ###long format; the sample names are the column headers
        tmp <- gather(tmp, key=Sample, value=Conc, -trait) %>%
               separate(Sample, into=c("strain","replicate"), sep="_") %>%
               separate(replicate, into=c("biological","technical"), sep="t")
        mutate(tmp, batch=i)
    })

    ###bind once at the end instead of growing the result inside the loop
    output <- do.call(rbind, per.file)
    output$Conc <- as.numeric(as.character(unlist(output$Conc)))
    return(output)
}


################################################################################################################################################################
###Load and normalize the data                                                  ################################################################################


    ###Data loading - Fatty acids
    ###The measurement files are resolved against workwd, like the targets file itself;
    ###the bare "Raw_data/Raw" used before only worked when the working directory was workwd
        targets <- read.delim(paste(workwd,"/Raw_data/Raw/FATargets.txt",sep="")); head(targets)
        data.fat <- FattyTargets(targets, location=paste(workwd,"/Raw_data/Raw",sep=""))

    ###Data loading - Amino acids
        targets <- read.delim(paste(workwd,"/Raw_data/Raw/AATargets.txt",sep="")); head(targets)
        data.aa <- AminoTargets(targets, location=paste(workwd,"/Raw_data/Raw",sep=""),sheet=3)

    ###Data loading - RILs information file
        RILs <- read.xlsx(paste(workwd,"/Raw_data/RILs.xlsx",sep="")) ##this file records for which strains the data are not reliable;
         ##the reliability information was typed manually into RILs.xlsx by Jelmi and is used below to exclude unreliable data


    ###Fatty acids: filter on technical grounds and average the technical replicates
    ###  - data.fat columns 2 and 6 are matched against the first two columns of RILs
    ###  - keep only samples marked Reliable == "yes" and discard batch 1 and batch 2
    ###  - blank a concentration when its trait is named in Traits_unreliable_AA or Traits_unreliable_FA
    ###  - blank concentrations < 0.03
    ###  - average what is left per batch, trait, strain and biological replicate
        FAdata.proces <- merge(data.fat, RILs, by.x = c(2, 6), by.y = c(1, 2)) %>%
                         filter(Reliable == "yes", batch != 1, batch != 2) %>%
                         mutate(trait = as.character(unlist(trait)),
                                Traits_unreliable_AA = as.character(unlist(Traits_unreliable_AA)),
                                Traits_unreliable_FA = as.character(unlist(Traits_unreliable_FA)),
                                Conc = as.numeric(as.character(unlist(Conc))),
                                ###samples without replicate labels are given default labels
                                biological = ifelse(is.na(biological), "b1", biological),
                                technical = ifelse(is.na(technical), "t1", technical)) %>%
                         ###grepl() uses a single pattern; within these groups trait is constant
                         group_by(batch, trait, strain, biological, technical) %>%
                         mutate(unreliable = (grepl(trait, Traits_unreliable_AA) | grepl(trait, Traits_unreliable_FA)) &
                                             (!is.na(Traits_unreliable_AA) | !is.na(Traits_unreliable_FA))) %>%
                         data.frame() %>%
                         mutate(Conc = ifelse(unreliable, NA, Conc),
                                Conc = ifelse(!is.na(Conc) & Conc < 0.03, NA, Conc)) %>%
                         group_by(batch, trait, strain, biological) %>%
                         summarise(trait.mean = mean(Conc, na.rm = TRUE)) %>%
                         data.frame() %>%
                         mutate(Data_type = "Fat") #extra column that labels these rows as fatty acid data




        ###Amino acids: same filtering as for the fatty acids, with a threshold of 0.4
        ###  - keep only samples marked Reliable == "yes" and discard batch 1 and batch 2
        ###  - blank a concentration when its trait is named in Traits_unreliable_AA or Traits_unreliable_FA
        ###  - blank concentrations < 0.4
        ###  - average what is left per batch, trait, strain and biological replicate
        AAdata.proces <- merge(data.aa, RILs, by.x = c(2, 6), by.y = c(1, 2)) %>%
                         filter(Reliable == "yes", batch != 1, batch != 2) %>%
                         mutate(trait = as.character(unlist(trait)),
                                Traits_unreliable_AA = as.character(unlist(Traits_unreliable_AA)),
                                Traits_unreliable_FA = as.character(unlist(Traits_unreliable_FA)),
                                Conc = as.numeric(as.character(unlist(Conc))),
                                ###samples without replicate labels are given default labels
                                biological = ifelse(is.na(biological), "b1", biological),
                                technical = ifelse(is.na(technical), "t1", technical)) %>%
                         ###grepl() uses a single pattern; within these groups trait is constant
                         group_by(batch, trait, strain, biological, technical) %>%
                         mutate(unreliable = (grepl(trait, Traits_unreliable_AA) | grepl(trait, Traits_unreliable_FA)) &
                                             (!is.na(Traits_unreliable_AA) | !is.na(Traits_unreliable_FA))) %>%
                         data.frame() %>%
                         mutate(Conc = ifelse(unreliable, NA, Conc),
                                Conc = ifelse(!is.na(Conc) & Conc < 0.4, NA, Conc)) %>%
                         group_by(batch, trait, strain, biological) %>%
                         summarise(trait.mean = mean(Conc, na.rm = TRUE)) %>%
                         data.frame() %>%
                         mutate(Data_type = "AA")


    ###Combine the files
    ###strain_type is derived from the original strain name; only after that is "ewir0" rewritten to "WN2"
        met.data <- rbind(FAdata.proces, AAdata.proces) %>%
                    mutate(strain_type = ifelse(strain == "N2", "PL_N2",
                                         ifelse(strain == "CB4856", "PL_CB4856",
                                         ifelse(grepl("WN", strain), "RIL",
                                         ifelse(grepl("ewir", strain), "IL_N2",
                                         ifelse(grepl("cbn", strain), "IL_CB4856", NA))))),
                           strain = gsub("ewir0", "WN2", strain))



    ###Save data (as R object and as tab-delimited text)
        Metabolomics_raw_data <- met.data
        save(Metabolomics_raw_data,
             file = paste0(workwd, "/Normalized_data/obj_Metabolomics_raw_data.Rdata"))
        write.table(met.data,
                    file = paste0(workwd, "/Normalized_data/Metabolomics_raw_data.txt"),
                    sep = "\t", quote = FALSE)


################################################################################################################################################################
###Data transformations                                                         ################################################################################

    ###Load data
        load(file=paste(workwd,"/Normalized_data/obj_Metabolomics_raw_data.Rdata",sep=""))
        
        ###file.path supplies the separator between directory and file name; the plain paste(...,sep="")
        ###used before only worked when support_git_dir itself ended in "/"
        trait_names <- read.delim(file.path(support_git_dir,"Trait_names.txt"))


    ###Add transformations
        ###Every metabolite is stored under six transformations (column trait_transformation):
        ###Abs.conc; the concentration averaged over the technical replicates (trait.mean)
        ###Perc.strain; the fraction of the summed concentration within one sample (same batch, strain, Data_type and biological replicate)
        ###Abs.batch, Perc.batch; batch corrected values, the batch mean of a metabolite is shifted onto the overall mean of that metabolite
        ###Abs.zscore, Perc.zscore; the z-score (x-mu)/sd per metabolite, calculated on the batch corrected values
        ###Values further than 2 sd from the mean per transformation, metabolite and strain are marked as outlier and set to NA

        ###Possibilities (for later): ratio with precursor, ratio aa/fat, ....
        allFAtraits <- c("C14:0","C15:0","C16:0","C17:0","C18:0","C19:0","C20:0","C21:0","C22:0",
                         "C14:1","C15:1","C16:1","C17:1","C18:1","C19:1","C20:1","C22:1",
                         "C18:3","C18:2","C20:5","C20:4","C20:3","C24:6")
        allAAtraits <- c("Phe","Tyr","Trp","Ala","Met","Gly","Val","Leu","Ile",
                         "Gln","Asn","Orn","Lys","Arg","Ser","Pro","Glu","Asp")
        alltraits <- c(allFAtraits,allAAtraits)


        Metabolomics_normalized_data <- Metabolomics_raw_data %>%
                                        ###fraction of the total within one sample
                                        group_by(batch, strain, Data_type, biological) %>%
                                        mutate(Perc.strain = trait.mean/sum(trait.mean, na.rm = TRUE)) %>%
                                        data.frame() %>%
                                        ###overall mean per metabolite (helper columns, removed again below)
                                        group_by(trait) %>%
                                        mutate(overall.perc = mean(Perc.strain, na.rm = TRUE),
                                               overall.abs = mean(trait.mean, na.rm = TRUE)) %>%
                                        data.frame() %>%
                                        ###batch correction: subtract the offset of the batch mean from the overall mean
                                        group_by(trait, batch) %>%
                                        mutate(Perc.batch = (Perc.strain-(mean(Perc.strain, na.rm = TRUE)-overall.perc)),
                                               Abs.batch = (trait.mean-(mean(trait.mean, na.rm = TRUE)-overall.abs))) %>%
                                        select(-overall.perc, -overall.abs) %>%
                                        data.frame() %>%
                                        ###z-scores per metabolite on the batch corrected values
                                        group_by(trait) %>%
                                        mutate(Perc.zscore = ((Perc.batch-mean(Perc.batch, na.rm = TRUE))/sd(Perc.batch, na.rm = TRUE)),
                                               Abs.zscore = ((Abs.batch-mean(Abs.batch, na.rm = TRUE))/sd(Abs.batch, na.rm = TRUE))) %>%
                                        data.frame() %>%
                                        ###long format: one row per metabolite and transformation
                                        rename(Abs.conc = trait.mean, metabolite = trait) %>%
                                        gather(key = trait_transformation, value = value,
                                               -c(batch, metabolite, strain, biological, Data_type, strain_type)) %>%
                                        mutate(trait = paste(metabolite, trait_transformation, sep = "."),
                                               value = ifelse(is.na(value) | value == -Inf, NA, value),
                                               trait_transformation = factor(trait_transformation,
                                                                             levels = c("Abs.conc","Abs.batch","Abs.zscore",
                                                                                        "Perc.strain","Perc.batch","Perc.zscore"))) %>%
                                        ###outlier limits per transformation, trait and strain
                                        group_by(trait_transformation, trait, strain) %>%
                                        mutate(upper = (mean(value, na.rm = TRUE)+2*sd(value, na.rm = TRUE)),
                                               lower = (mean(value, na.rm = TRUE)-2*sd(value, na.rm = TRUE))) %>%
                                        data.frame() %>%
                                        group_by(trait, strain, biological, batch, trait_transformation) %>%
                                        ###the conversion through character is kept as it was so the stored numbers do not change
                                        mutate(value = as.numeric(as.character(unlist(value))),
                                               upper = as.numeric(as.character(unlist(upper))),
                                               lower = as.numeric(as.character(unlist(lower))),
                                               outlier = !is.na(upper) & (value > upper | value < lower)) %>%
                                        data.frame() %>%
                                        mutate(value = ifelse(outlier, NA_real_, as.numeric(value))) %>%
                                        data.frame() %>%
                                        ###add the trait annotation (joined on the columns both tables share)
                                        merge(trait_names)

    ###Save transformations (as R object and as tab-delimited text)
        save(Metabolomics_normalized_data,
             file = paste0(workwd, "/Normalized_data/obj_Metabolomics_normalized_data.Rdata"))
        write.table(Metabolomics_normalized_data,
                    file = paste0(workwd, "/Normalized_data/Metabolomics_normalized_data.txt"),
                    sep = "\t", quote = FALSE)

    ###Save data @git
        save(Metabolomics_normalized_data,
             file = paste0(support_git_dir, "/obj_Metabolomics_normalized_data.Rdata"))






################################################################################################################################################################
###Load and normalize the RNAi data                                            ################################################################################

    ###Data loading - Fatty acids (RNAi experiment)
    ###The measurement files are resolved against workwd, like the targets file itself;
    ###the bare "Raw_data/Raw" used before only worked when the working directory was workwd
        targets <- read.delim(paste(workwd,"/Raw_data/Raw/FATargets_RNAi.txt",sep="")); head(targets)
        data.fat <- FattyTargets(targets, location=paste(workwd,"/Raw_data/Raw",sep=""))

        
    ###Data loading - RNAi information file
        RNAis <- read.xlsx(paste(workwd,"/Raw_data/RNAis.xlsx",sep="")) ##this file records which samples are not reliable;
         ##its Reliable column is used below to exclude unreliable data


    ###Filter based on technical reasons and unreliable data
    ###data.fat columns 2, 3 and 6 are matched against columns 1, 8 and 2 of RNAis
    ###Only samples marked Reliable == "yes" are kept (no batches are removed for the RNAi data)
    ###Remove concentrations < 0.03 for fatty acids
    ###Average what is left per batch, trait, strain and biological replicate
        FAdata.proces <- merge(data.fat,RNAis,by.x=c(2,3,6),by.y=c(1,8,2)) %>%
                         filter(Reliable == "yes") %>%
                         mutate(trait = as.character(unlist(trait)),Conc=as.numeric(as.character(unlist(Conc)))) %>%
                         ###samples without replicate labels are given default labels
                         mutate(biological=ifelse(is.na(biological),"b1",biological),technical=ifelse(is.na(technical),"t1",technical)) %>%
                         mutate(Conc=ifelse(!is.na(Conc) & Conc < 0.03,NA,Conc)) %>%
                         group_by(batch,trait,strain,biological) %>%
                         summarise(trait.mean=mean(Conc,na.rm=TRUE)) %>%
                         data.frame() %>%
                         mutate(Data_type="Fat") #extra column that labels these rows as fatty acid data

        ###file.path supplies the separator between directory and file name; the plain paste(...,sep="")
        ###used before only worked when support_git_dir itself ended in "/"
        trait_names <- read.delim(file.path(support_git_dir,"Trait_names.txt"))


    ###Add transformations
        ###Every metabolite is stored under six transformations (column trait_transformation):
        ###Abs.conc; the concentration averaged over the technical replicates (trait.mean)
        ###Perc.strain; the fraction of the summed concentration within one sample (same batch, strain, Data_type and biological replicate)
        ###Abs.batch, Perc.batch; batch corrected values, the batch mean of a metabolite is shifted onto the overall mean of that metabolite
        ###Abs.zscore, Perc.zscore; the z-score (x-mu)/sd per metabolite, calculated on the batch corrected values
        ###Values further than 2 sd from the mean per transformation, metabolite and strain are marked as outlier and set to NA

        ###Possibilities (for later): ratio with precursor, ratio aa/fat, ....
        allFAtraits <- c("C14:0","C15:0","C16:0","C17:0","C18:0","C19:0","C20:0","C21:0","C22:0",
                         "C14:1","C15:1","C16:1","C17:1","C18:1","C19:1","C20:1","C22:1",
                         "C18:3","C18:2","C20:5","C20:4","C20:3","C24:6")
        allAAtraits <- c("Phe","Tyr","Trp","Ala","Met","Gly","Val","Leu","Ile",
                         "Gln","Asn","Orn","Lys","Arg","Ser","Pro","Glu","Asp")
        alltraits <- c(allFAtraits,allAAtraits)


        Metabolomics_normalized_RNAi <- FAdata.proces %>%
                                        ###fraction of the total within one sample
                                        group_by(batch, strain, Data_type, biological) %>%
                                        mutate(Perc.strain = trait.mean/sum(trait.mean, na.rm = TRUE)) %>%
                                        data.frame() %>%
                                        ###overall mean per metabolite (helper columns, removed again below)
                                        group_by(trait) %>%
                                        mutate(overall.perc = mean(Perc.strain, na.rm = TRUE),
                                               overall.abs = mean(trait.mean, na.rm = TRUE)) %>%
                                        data.frame() %>%
                                        ###batch correction: subtract the offset of the batch mean from the overall mean
                                        group_by(trait, batch) %>%
                                        mutate(Perc.batch = (Perc.strain-(mean(Perc.strain, na.rm = TRUE)-overall.perc)),
                                               Abs.batch = (trait.mean-(mean(trait.mean, na.rm = TRUE)-overall.abs))) %>%
                                        select(-overall.perc, -overall.abs) %>%
                                        data.frame() %>%
                                        ###z-scores per metabolite on the batch corrected values
                                        group_by(trait) %>%
                                        mutate(Perc.zscore = ((Perc.batch-mean(Perc.batch, na.rm = TRUE))/sd(Perc.batch, na.rm = TRUE)),
                                               Abs.zscore = ((Abs.batch-mean(Abs.batch, na.rm = TRUE))/sd(Abs.batch, na.rm = TRUE))) %>%
                                        data.frame() %>%
                                        ###long format: one row per metabolite and transformation
                                        rename(Abs.conc = trait.mean, metabolite = trait) %>%
                                        gather(key = trait_transformation, value = value,
                                               -c(batch, metabolite, strain, biological, Data_type)) %>%
                                        mutate(trait = paste(metabolite, trait_transformation, sep = "."),
                                               value = ifelse(is.na(value) | value == -Inf, NA, value),
                                               trait_transformation = factor(trait_transformation,
                                                                             levels = c("Abs.conc","Abs.batch","Abs.zscore",
                                                                                        "Perc.strain","Perc.batch","Perc.zscore"))) %>%
                                        ###outlier limits per transformation, trait and strain
                                        group_by(trait_transformation, trait, strain) %>%
                                        mutate(upper = (mean(value, na.rm = TRUE)+2*sd(value, na.rm = TRUE)),
                                               lower = (mean(value, na.rm = TRUE)-2*sd(value, na.rm = TRUE))) %>%
                                        data.frame() %>%
                                        group_by(trait, strain, biological, batch, trait_transformation) %>%
                                        ###the conversion through character is kept as it was so the stored numbers do not change
                                        mutate(value = as.numeric(as.character(unlist(value))),
                                               upper = as.numeric(as.character(unlist(upper))),
                                               lower = as.numeric(as.character(unlist(lower))),
                                               outlier = !is.na(upper) & (value > upper | value < lower)) %>%
                                        data.frame() %>%
                                        mutate(value = ifelse(outlier, NA_real_, as.numeric(value))) %>%
                                        data.frame() %>%
                                        ###add the trait annotation (joined on the columns both tables share)
                                        merge(trait_names)


    ###Save transformations (as R object and as tab-delimited text)
        save(Metabolomics_normalized_RNAi,
             file = paste0(workwd, "/Normalized_data/obj_Metabolomics_normalized_RNAi.Rdata"))
        write.table(Metabolomics_normalized_RNAi,
                    file = paste0(workwd, "/Normalized_data/Metabolomics_normalized_RNAi.txt"),
                    sep = "\t", quote = FALSE)

    ###Save data @git
        save(Metabolomics_normalized_RNAi,
             file = paste0(support_git_dir, "/obj_Metabolomics_normalized_RNAi.Rdata"))









